Notebook to recreate the results presented in the paper. Keep in mind that the results can differ slightly, as the random background will not create exactly the same random sets as before.
import os
import sys
import json
import pandas as pd
from IPython.display import IFrame
# ==== import DIGEST python package called biodigest ====
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from biodigest.single_validation import single_validation, save_results
from biodigest.evaluation.d_utils.plotting_utils import create_plots,create_extended_plots
import warnings
warnings.filterwarnings("ignore")
If this package is being used for the first time after installation, make sure that the setup has been run beforehand so that all required files are available.
from biodigest import setup
# Download/prepare the required mapping files via the API setup mode.
# Only needed once after installation (see note above).
setup.main(setup_type="api")
Compare a target cluster of diseases or genes based on the Dunn index, silhouette score, or Davies-Bouldin index, while the random runs are simply cluster-size-preserving perturbations of the cluster assignments.
# ==== define required input ====
# Target clustering: tab-separated file with columns id, cluster assignment, description
tar_cluster = pd.read_csv("input/target_gene_cluster.txt", header=None, sep="\t", dtype=str, names=["id", "cluster", "desc"])
tar_id_type = "symbol"  # ids are gene symbols
mode = "clustering"
# ==== define optional input influencing results ====
distance_measure="jaccard" # which distance measure should be chosen; choice between jaccard and overlap
background_model="complete" # the only background model for clustering
runs = 1000 # how many random runs for empirical p-value estimation
perc = 100 # what % of the original input should be perturbed for the background model
# ==== define optional input influencing saving of results ====
out_dir = "results/gene_cluster/"
verbose=True # print additional information during the run
prefix="gene_cluster"
# Run the validation; replace=perc controls the perturbation percentage per random run
results = single_validation(tar=tar_cluster, tar_id=tar_id_type, mode=mode, background_model=background_model,
runs=runs, replace=perc, verbose=verbose, distance=distance_measure)
Check all P-values
# Display the empirical p-values per attribute as a table
pd.DataFrame(results["p_values"]['values'])
Check validation values of input cluster
# Display the validation values computed for the input clustering
pd.DataFrame(results["input_values"]['values'])
# Persist the full results dict to out_dir using the given file prefix
save_results(results=results, prefix=prefix, out_dir=out_dir)
If flag for plot was set, you can see a plot with the p-values for every attribute and a mappability plot showing how many of the input ids had assigned values per attribute.
# P-value and mappability plots (saved to out_dir as well, see note below)
create_plots(results=results, mode=mode, tar=tar_cluster, tar_id=tar_id_type, out_dir=out_dir, prefix=prefix)
# Additional, more detailed plots for the same run
create_extended_plots(results=results, mode=mode, tar=tar_cluster, out_dir=out_dir, prefix=prefix)
Keep in mind that after plotting, the resulting plots are also saved in the output directory.
# ==== define required input ====
# Target clustering: tab-separated file with columns id, cluster assignment, description
tar_cluster = pd.read_csv("input/target_disease_cluster.txt", header=None, sep="\t", dtype=str, names=["id", "cluster", "desc"])
tar_id_type = "ICD-10"  # ids are ICD-10 disease codes
mode = "clustering"
# ==== define optional input influencing results ====
distance_measure="jaccard" # which distance measure should be chosen; choice between jaccard and overlap
background_model="complete" # the only background model for clustering
runs = 1000 # how many random runs for empirical p-value estimation
perc = 100 # what % of the original input should be perturbed for the background model
# ==== define optional input influencing saving of results ====
out_dir = "results/disease_cluster/"
verbose=True # print additional information during the run
prefix="disease_cluster"
# Run the validation; replace=perc controls the perturbation percentage per random run
results = single_validation(tar=tar_cluster, tar_id=tar_id_type, mode=mode, background_model=background_model,
runs=runs, replace=perc, verbose=verbose, distance=distance_measure)
Check all P-values
# Display the empirical p-values per attribute as a table
pd.DataFrame(results["p_values"]['values'])
Check validation values of input cluster
# Display the validation values computed for the input clustering
pd.DataFrame(results["input_values"]['values'])
# Persist the full results dict to out_dir using the given file prefix
save_results(results=results, prefix=prefix, out_dir=out_dir)
If flag for plot was set, you can see a plot with the p-values for every attribute and a mappability plot showing how many of the input ids had assigned values per attribute.
# P-value and mappability plots (saved to out_dir as well, see note below)
create_plots(results=results, mode=mode, tar=tar_cluster, tar_id=tar_id_type, out_dir=out_dir, prefix=prefix)
# Additional, more detailed plots for the same run
create_extended_plots(results=results, mode=mode, tar=tar_cluster, out_dir=out_dir, prefix=prefix)
Keep in mind that after plotting, the resulting plots are also saved in the output directory.
Validate a set by reference — either against a reference set or a reference id — or reference-free. Two background models can be used.
# ==== define required input ====
# Target disease set: first column of a tab-separated file, one id per line
tar_set = pd.read_csv("input/target_disease_set.txt", header=None, sep="\t", dtype=str)[0]
tar_id_type = "mesh"  # ids are MeSH disease ids
mode = "set"
# ==== define optional input influencing results ====
distance_measure="jaccard" # which distance measure should be chosen; choice between jaccard and overlap
background_model= "term-pres" # term-preserving background model; "complete" is the alternative (two models available, see above)
runs = 1000 # how many random runs for empirical p-value estimation
perc = 100 # what % of the original input should be perturbed for the background model
# ==== define optional input influencing saving of results ====
out_dir = "results/disease_set/"
verbose=True # print additional information during the run
prefix="disease_set"
# Reference-free validation: no ref/ref_id is passed here
results = single_validation(tar=tar_set, tar_id=tar_id_type, mode=mode, background_model=background_model,
runs=runs, replace=perc, verbose=verbose, distance=distance_measure)
Check all P-values
# Display the empirical p-values per attribute as a table
pd.DataFrame(results["p_values"]['values'])
Check validation values of input set
# Display the validation values computed for the input set
pd.DataFrame(results["input_values"]['values'])
# Persist the full results dict to out_dir using the given file prefix
save_results(results=results, prefix=prefix, out_dir=out_dir)
If flag for plot was set, you can see a plot with the p-values for every attribute and a mappability plot showing how many of the input ids had assigned values per attribute.
# P-value and mappability plots (also saved to out_dir)
create_plots(results=results, mode=mode, tar=tar_set, tar_id=tar_id_type, out_dir=out_dir, prefix=prefix)
# Additional, more detailed plots for the same run
create_extended_plots(results=results, mode=mode, tar=tar_set, out_dir=out_dir, prefix=prefix)
# ==== define required input ====
# Target gene set: first column of a tab-separated file, one symbol per line
tar_set = pd.read_csv("input/target_gene_set.txt", header=None, sep="\t", dtype=str)[0]
tar_id_type = "symbol"
# Reference gene set the target set is validated against (mode "set-set")
ref_set = pd.read_csv("input/reference_gene_set.txt", header=None, sep="\t", dtype=str)[0]
ref_id_type = "symbol"
mode = "set-set"
# ==== define optional input influencing results ====
distance_measure="jaccard" # which distance measure should be chosen; choice between jaccard and overlap
background_model="term-pres" # term-preserving background model; "complete" is the alternative (two models available, see above)
runs = 1000 # how many random runs for empirical p-value estimation
perc = 100 # what % of the original input should be perturbed for the background model
enriched=False # presumably restricts the comparison to enriched terms -- TODO confirm against biodigest docs
# ==== define optional input influencing saving of results ====
out_dir = "results/gene_set/"
verbose=True # print additional information during the run
prefix="gene_set"
# NOTE(review): replace=perc was missing from this call (unlike the cluster/set runs
# above), so the perc setting had no effect; it is now passed through for consistency.
results = single_validation(tar=tar_set, tar_id=tar_id_type, ref=ref_set, ref_id=ref_id_type, mode=mode,
runs=runs, replace=perc, background_model=background_model, verbose=verbose,
enriched=enriched, distance=distance_measure)
Check all P-values
# Display the empirical p-values per attribute as a table
pd.DataFrame(results["p_values"]['values'])
Check validation values of input set
# Display the validation values computed for the input set
pd.DataFrame(results["input_values"]['values'])
# Persist the full results dict to out_dir using the given file prefix
save_results(results=results, prefix=prefix, out_dir=out_dir)
If flag for plot was set, you can see a plot with the p-values for every attribute and a mappability plot showing how many of the input ids had assigned values per attribute.
# P-value and mappability plots (also saved to out_dir)
create_plots(results=results, mode=mode, tar=tar_set, tar_id=tar_id_type, out_dir=out_dir, prefix=prefix)
# Additional, more detailed plots for the same run
create_extended_plots(results=results, mode=mode, tar=tar_set, out_dir=out_dir, prefix=prefix)
# ==== define required input ====
# Target subnetwork nodes: first column of a comma-separated file, one symbol per line
tar_set = pd.read_csv("input/target_gene_subnetwork.txt", header=None, sep=",", dtype=str)[0]
tar_id_type = "symbol"
# Underlying network as graphml; prop_name is the node attribute holding the id of id_type
network_data = {"network_file":"input/gene_network.graphml", "prop_name":"name", "id_type":"symbol"}
mode = "subnetwork"
# ==== define optional input influencing results ====
distance_measure="jaccard" # which distance measure should be chosen; choice between jaccard and overlap
background_model="network" # network-based background model used for subnetwork validation
runs = 1000 # how many random runs for empirical p-value estimation
perc = 100 # what % of the original input should be perturbed for the background model
enriched=True # presumably restricts the validation to enriched terms -- TODO confirm against biodigest docs
# ==== define optional input influencing saving of results ====
out_dir = "results/gene_network/"
verbose=True # print additional information during the run
prefix="gene_network"
# NOTE(review): replace=perc was missing from this call (unlike the cluster/set runs
# above), so the perc setting had no effect; it is now passed through for consistency.
results = single_validation(tar=tar_set, tar_id=tar_id_type, mode=mode,
runs=runs, replace=perc, background_model=background_model, verbose=verbose,
enriched=enriched, distance=distance_measure, network_data=network_data)
Check all P-values
# Display the empirical p-values per attribute as a table
pd.DataFrame(results["p_values"]['values'])
Check validation values of input subnetwork
# Display the validation values computed for the input subnetwork
pd.DataFrame(results["input_values"]['values'])
# Persist the full results dict to out_dir using the given file prefix
save_results(results=results, prefix=prefix, out_dir=out_dir)
If flag for plot was set, you can see a plot with the p-values for every attribute and a mappability plot showing how many of the input ids had assigned values per attribute.
# P-value and mappability plots (also saved to out_dir)
create_plots(results=results, mode=mode, tar=tar_set, tar_id=tar_id_type, out_dir=out_dir, prefix=prefix)
# Additional, more detailed plots for the same run
create_extended_plots(results=results, mode=mode, tar=tar_set, out_dir=out_dir, prefix=prefix)